In [23]:
%pylab inline
%load_ext autoreload
%autoreload 2
from __future__ import division
from collections import defaultdict, namedtuple
import cPickle as pickle
from datetime import datetime, timedelta
import dateutil
from functools import partial
import inspect
import json
import os
import re
import sys
import numpy as np
import pandas as pd
In [2]:
if os.name == 'nt':
    TRAIN_PATH = r'D:\train.csv'
    PTRAIN_PATH = r'D:\train_preprocessed_float_string_date.csv'
    TEST_PATH = r'D:\test.csv'
    GOOGNEWS_PATH = r'D:\GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'D:\big.txt'
else:
    TRAIN_PATH = r'/media/mtambos/speedy/train.csv'
    PTRAIN_PATH = r'/media/mtambos/speedy/train_preprocessed_float_string_date.csv'
    TEST_PATH = r'/media/mtambos/speedy/test.csv'
    GOOGNEWS_PATH = r'/media/mtambos/speedy/GoogleNews-vectors-negative300.bin.gz'
    VOCAB_PATH = r'/media/mtambos/speedy/big.txt'
#df_orig = pd.read_csv(TRAIN_PATH, index_col="ID")
df = pd.read_csv(PTRAIN_PATH, index_col="ID")
#df
In [3]:
date_cols = ['VAR_0073', 'VAR_0075', 'VAR_0156',
'VAR_0157', 'VAR_0158', 'VAR_0159',
'VAR_0166', 'VAR_0167', 'VAR_0168',
'VAR_0169', 'VAR_0176', 'VAR_0177',
'VAR_0178', 'VAR_0179', 'VAR_0204',
'VAR_0217', 'VAR_0294', 'VAR_0314']
In [6]:
def parse_date_str(date_val):
    """Parse SAS-style date strings (e.g. '29JAN12:00:00:00') into datetimes."""
    if isinstance(date_val, datetime):
        return date_val
    date_val = str(date_val).lower()
    if date_val == 'nan':
        return pd.NaT
    # Strip the trailing '.0' left over when the value was read as a float.
    if date_val.endswith('.0'):
        date_val = date_val[:-2]
    # Try the full timestamp first, then date-only, then year-only.
    for fmt in ('%d%b%y:%H:%M:%S', '%d%b%y', '%Y'):
        try:
            return datetime.strptime(date_val, fmt)
        except ValueError:
            continue
    print date_val  # log unparseable values before falling back to NaT
    return pd.NaT
df_date_cols = df[date_cols].applymap(parse_date_str).astype('datetime64[ns]')
df_date_cols.describe()
Out[6]:
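A quick sanity check of the parser on the three formats it handles, plus a missing value. The sample strings below are hypothetical, not taken from the data:

# Sketch: exercise parse_date_str on each supported format.
for sample in ('29JAN12:10:30:00', '29JAN12', '2012', 'nan'):
    print sample, '->', parse_date_str(sample)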
In [11]:
df[date_cols] = df_date_cols
In [8]:
df.drop_duplicates(inplace=True)
In [13]:
cols_to_drop = set()
for i, col in enumerate(date_cols):
    for col2 in date_cols[i+1:]:
        if (df[col] == df[col2]).all():
            cols_to_drop.add(col2)
cols_to_drop
Out[13]:
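One caveat about the == comparison above: NaT, like NaN, compares unequal to everything including itself, so two columns that agree everywhere but share missing entries are never flagged. A NaT-aware variant, as a sketch using Series.equals (which treats missing values in the same positions as equal):

# Sketch: NaT-aware duplicate-column detection.
cols_to_drop_nan_aware = set()
for i, col in enumerate(date_cols):
    for col2 in date_cols[i+1:]:
        if df[col].equals(df[col2]):
            cols_to_drop_nan_aware.add(col2)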
In [14]:
nan_cols = df.isnull().all()
nan_cols = nan_cols.index[nan_cols].tolist()
nan_cols
Out[14]:
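If the all-null columns should actually be removed rather than just listed, dropna does it in one call; a minimal sketch:

# Sketch: drop columns that contain nothing but NaN.
df = df.dropna(axis=1, how='all')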
In [17]:
df[date_cols] = df[date_cols].astype('datetime64[ns]')
years = pd.DataFrame(columns=[c+'_year' for c in date_cols], index=df.index, dtype=np.int)
months = pd.DataFrame(columns=[c+'_month' for c in date_cols], index=df.index, dtype=np.int)
days = pd.DataFrame(columns=[c+'_day' for c in date_cols], index=df.index, dtype=np.int)
for c in date_cols:
    dateIndex = pd.DatetimeIndex(df[c])
    years[c+'_year'] = dateIndex.year
    months[c+'_month'] = dateIndex.month
    days[c+'_day'] = dateIndex.day
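On pandas >= 0.15 the loop body can use the Series.dt accessor instead of building a DatetimeIndex for each column; an equivalent sketch:

# Sketch: same year/month/day extraction via the .dt accessor.
for c in date_cols:
    years[c+'_year'] = df[c].dt.year
    months[c+'_month'] = df[c].dt.month
    days[c+'_day'] = df[c].dt.day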
In [18]:
df = df.drop(date_cols, axis=1)
df = df.join(years)
df = df.join(months)
df = df.join(days)
In [19]:
date_cols = years.columns.tolist() + months.columns.tolist() + days.columns.tolist()
In [21]:
df_desc = df.describe()
df_desc[sorted(df_desc.columns, key=lambda x: df_desc.loc['std', x])]
Out[21]:
In [27]:
std_series = df_desc.loc['std', :]
null_std_cols = std_series[std_series == 0]
df = df.drop(null_std_cols.index, axis=1)
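Note that describe() reports std only for numeric columns, so a constant string column would survive this filter. A broader sketch based on the number of distinct values (NaNs excluded, since nunique drops them by default):

# Sketch: drop any column, numeric or not, with at most one distinct value.
constant_cols = [c for c in df.columns if df[c].nunique() <= 1]
df = df.drop(constant_cols, axis=1)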
In [28]:
df.to_csv(PTRAIN_PATH)
In [24]:
with open('date_cols.pickle', 'wb') as fp:
    pickle.dump(date_cols, fp)
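The pickled list can then be restored in a downstream notebook with the matching load call:

# Sketch: reload the saved date column names.
with open('date_cols.pickle', 'rb') as fp:
    date_cols = pickle.load(fp)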